This presentation: http://bit.ly/datavizualisation
13 February 2018
This presentation: http://bit.ly/datavizualisation
library("tidyverse")
## ── Attaching packages ────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 2.2.1 ✔ purrr 0.2.4 ## ✔ tibble 1.3.4 ✔ dplyr 0.7.4 ## ✔ tidyr 0.7.2 ✔ stringr 1.2.0 ## ✔ readr 1.1.1 ✔ forcats 0.2.0
## ── Conflicts ───────────────────────────────────── tidyverse_conflicts() ── ## ✖ dplyr::filter() masks stats::filter() ## ✖ dplyr::lag() masks stats::lag()
alt text
Figure from http://r4ds.had.co.nz/tidy-data.html
Following three rules makes a dataset tidy: variables are in columns, observations are in rows, and values are in cells.
# Load the example dataset from the CSV file into a tibble.
mydata <- read_csv(file = "datasets/architect.csv")
## Parsed with column specification: ## cols( ## .default = col_double(), ## X1 = col_integer(), ## FileName = col_character(), ## genotype = col_character(), ## Time = col_integer(), ## TNLR = col_integer(), ## N2LR = col_integer(), ## Magnitude = col_integer(), ## Altitude = col_integer(), ## ExtPathLength = col_integer() ## )
## See spec(...) for full column specifications.
# Print a compact overview of the columns and their types.
str(object = mydata)
## Classes 'tbl_df', 'tbl' and 'data.frame': 647 obs. of 20 variables: ## $ X1 : int 1 2 3 4 5 6 7 8 9 10 ... ## $ FileName : chr "dense-3-1-12_1" "dense-3-1-12_1" "dense-3-1-12_1" "dense-3-1-12_1" ... ## $ genotype : chr "dense" "dense" "dense" "dense" ... ## $ Time : int 4 5 6 7 8 9 10 11 12 3 ... ## $ TRL : num 118 236 591 1594 3661 ... ## $ GRTR : num 118 118 354 1004 2067 ... ## $ L1R : num 118 236 354 472 591 ... ## $ GR1R : num 118 118 118 118 118 ... ## $ TNLR : int 0 3 13 22 29 36 44 54 54 0 ... ## $ TLRL : num 0.00 5.72e-05 2.36e+02 1.12e+03 3.07e+03 ... ## $ N2LR : int 0 3 13 22 29 36 44 54 54 0 ... ## $ L2LR : num 0.00 5.72e-05 2.36e+02 1.12e+03 3.07e+03 ... ## $ ML2LR : num 0.00 1.91e-05 1.82e+01 5.10e+01 1.06e+02 ... ## $ GR2L : num 0.00 5.72e-05 2.36e+02 8.86e+02 1.95e+03 ... ## $ D2LR : num 0 0.0127 0.0367 0.0466 0.0491 ... ## $ Height : num 116 231 348 465 581 ... ## $ Width : num 20.6 43.9 107 168.8 233 ... ## $ Magnitude : int 1 4 14 23 30 37 45 55 55 1 ... ## $ Altitude : int 1 4 14 23 30 37 45 55 55 1 ... ## $ ExtPathLength: int 1 13 118 298 494 739 1079 1594 1594 1 ... ## - attr(*, "spec")=List of 2 ## ..$ cols :List of 20 ## .. ..$ X1 : list() ## .. .. ..- attr(*, "class")= chr "collector_integer" "collector" ## .. ..$ FileName : list() ## .. .. ..- attr(*, "class")= chr "collector_character" "collector" ## .. ..$ genotype : list() ## .. .. ..- attr(*, "class")= chr "collector_character" "collector" ## .. ..$ Time : list() ## .. .. ..- attr(*, "class")= chr "collector_integer" "collector" ## .. ..$ TRL : list() ## .. .. ..- attr(*, "class")= chr "collector_double" "collector" ## .. ..$ GRTR : list() ## .. .. ..- attr(*, "class")= chr "collector_double" "collector" ## .. ..$ L1R : list() ## .. .. ..- attr(*, "class")= chr "collector_double" "collector" ## .. ..$ GR1R : list() ## .. .. ..- attr(*, "class")= chr "collector_double" "collector" ## .. ..$ TNLR : list() ## .. .. ..- attr(*, "class")= chr "collector_integer" "collector" ## .. 
..$ TLRL : list() ## .. .. ..- attr(*, "class")= chr "collector_double" "collector" ## .. ..$ N2LR : list() ## .. .. ..- attr(*, "class")= chr "collector_integer" "collector" ## .. ..$ L2LR : list() ## .. .. ..- attr(*, "class")= chr "collector_double" "collector" ## .. ..$ ML2LR : list() ## .. .. ..- attr(*, "class")= chr "collector_double" "collector" ## .. ..$ GR2L : list() ## .. .. ..- attr(*, "class")= chr "collector_double" "collector" ## .. ..$ D2LR : list() ## .. .. ..- attr(*, "class")= chr "collector_double" "collector" ## .. ..$ Height : list() ## .. .. ..- attr(*, "class")= chr "collector_double" "collector" ## .. ..$ Width : list() ## .. .. ..- attr(*, "class")= chr "collector_double" "collector" ## .. ..$ Magnitude : list() ## .. .. ..- attr(*, "class")= chr "collector_integer" "collector" ## .. ..$ Altitude : list() ## .. .. ..- attr(*, "class")= chr "collector_integer" "collector" ## .. ..$ ExtPathLength: list() ## .. .. ..- attr(*, "class")= chr "collector_integer" "collector" ## ..$ default: list() ## .. ..- attr(*, "class")= chr "collector_guess" "collector" ## ..- attr(*, "class")= chr "col_spec"
Use dplyr to filter data based on specific values. %>% is called a pipe and allows you to queue up operations.
# Keep only the rows whose genotype is "dense", then show the first rows.
mydata %>%
  filter(genotype == "dense") %>%
  head()
## # A tibble: 6 x 20 ## X1 FileName genotype Time TRL GRTR L1R ## <int> <chr> <chr> <int> <dbl> <dbl> <dbl> ## 1 1 dense-3-1-12_1 dense 4 118.1103 118.1103 118.1103 ## 2 2 dense-3-1-12_1 dense 5 236.2205 118.1103 236.2205 ## 3 3 dense-3-1-12_1 dense 6 590.5513 354.3308 354.3307 ## 4 4 dense-3-1-12_1 dense 7 1594.4887 1003.9373 472.4410 ## 5 5 dense-3-1-12_1 dense 8 3661.4180 2066.9294 590.5511 ## 6 6 dense-3-1-12_1 dense 9 6259.8433 2598.4253 708.6613 ## # ... with 13 more variables: GR1R <dbl>, TNLR <int>, TLRL <dbl>, ## # N2LR <int>, L2LR <dbl>, ML2LR <dbl>, GR2L <dbl>, D2LR <dbl>, ## # Height <dbl>, Width <dbl>, Magnitude <int>, Altitude <int>, ## # ExtPathLength <int>
You can also select or drop specific columns using the select verb.
# Keep five columns of interest, then show the first rows.
mydata %>%
  select(FileName, genotype, Time, Height, Width) %>%
  head()
## # A tibble: 6 x 5 ## FileName genotype Time Height Width ## <chr> <chr> <int> <dbl> <dbl> ## 1 dense-3-1-12_1 dense 4 115.9124 20.61023 ## 2 dense-3-1-12_1 dense 5 231.4529 43.94336 ## 3 dense-3-1-12_1 dense 6 348.0965 106.99701 ## 4 dense-3-1-12_1 dense 7 465.2197 168.76273 ## 5 dense-3-1-12_1 dense 8 581.1602 232.97046 ## 6 dense-3-1-12_1 dense 9 698.4634 289.51532
You can also select or drop specific columns using the select verb.
# Drop the X1 column and overwrite mydata with the result.
mydata <- mydata %>%
  select(-X1)

# Show the first rows of the updated table.
mydata %>%
  head()
## # A tibble: 6 x 19 ## FileName genotype Time TRL GRTR L1R GR1R ## <chr> <chr> <int> <dbl> <dbl> <dbl> <dbl> ## 1 dense-3-1-12_1 dense 4 118.1103 118.1103 118.1103 118.1103 ## 2 dense-3-1-12_1 dense 5 236.2205 118.1103 236.2205 118.1102 ## 3 dense-3-1-12_1 dense 6 590.5513 354.3308 354.3307 118.1103 ## 4 dense-3-1-12_1 dense 7 1594.4887 1003.9373 472.4410 118.1102 ## 5 dense-3-1-12_1 dense 8 3661.4180 2066.9294 590.5511 118.1102 ## 6 dense-3-1-12_1 dense 9 6259.8433 2598.4253 708.6613 118.1102 ## # ... with 12 more variables: TNLR <int>, TLRL <dbl>, N2LR <int>, ## # L2LR <dbl>, ML2LR <dbl>, GR2L <dbl>, D2LR <dbl>, Height <dbl>, ## # Width <dbl>, Magnitude <int>, Altitude <int>, ExtPathLength <int>
And you can create new variables using the mutate verb.
# Add newvar as the natural log of TRL, keep three columns, and show the
# first rows. mydata itself is not modified (the result is not assigned).
mydata %>%
  mutate(newvar = log(TRL)) %>%
  select(genotype, Time, newvar) %>%
  head()
## # A tibble: 6 x 3 ## genotype Time newvar ## <chr> <int> <dbl> ## 1 dense 4 4.771619 ## 2 dense 5 5.464766 ## 3 dense 6 6.381057 ## 4 dense 7 7.374308 ## 5 dense 8 8.205606 ## 6 dense 9 8.741910
alt text
ggplot?
Used to produce statistical graphics; main developer: Hadley Wickham
attempt to take the good things about base and lattice graphics and improve on them with a strong, underlying model
based on The Grammar of Graphics by Leland Wilkinson, 2005
describes the meaning of what we do when we construct statistical graphics … More than a taxonomy … Computational system based on the underlying mathematics of representing statistical functions of data.
ggplot components
data: in ggplot2, data must be stored as an R data frame
coordinate system: describes the 2-D space that data is projected onto
geoms: describe the type of geometric objects that represent data
aesthetics: describe visual characteristics that represent data
scales: for each aesthetic, describe how the visual characteristic is converted to display values
stats: describe statistical transformations that typically summarize data
facets: describe how data is split into subsets and displayed as multiple small graphs
data and aesthetic
We first create the plot by setting the data and the aesthetic.
# Create the base plot object: mydata with TRL on x and TNLR on y.
# No geom is added here.
myplot <- ggplot(data = mydata, aes(x = TRL, y = TNLR))

# Print the plot object to render it.
myplot
geometry - Points
We need to add a geom to display the plot. Different geoms can be used.
# Add a point layer to the base plot.
myplot +
  geom_point()
geometry - Lines
We need to add a geom to display the plot. Different geoms can be used.
# Add a line layer to the base plot.
myplot +
  geom_line()
geometry - Steps
We need to add a geom to display the plot. Different geoms can be used.
# Add a step layer to the base plot.
myplot +
  geom_step()
geoms
The advantage of using a layered approach is that the layers can be combined. For instance, several geoms can be used in the same plot.
# Combine two layers on the same base plot: points, then steps.
myplot +
  geom_point() +
  geom_step()
Let's use the different categories we have. For this, we add a colour argument in the aes. ggplot will automatically pick a discrete color scale.
# Scatter plot of TNLR against TRL, with point colour mapped to genotype.
ggplot(
  data = mydata,
  mapping = aes(x = TRL, y = TNLR, colour = genotype)
) +
  geom_point()
Let's use one of the variables as a continuous category. For this, we add a colour argument in the aes. ggplot will automatically pick a continuous color scale.
# Scatter plot of TNLR against TRL, with point colour mapped to Height.
ggplot(
  data = mydata,
  mapping = aes(x = TRL, y = TNLR, colour = Height)
) +
  geom_point()
When using multiple categories, we can define both different colors (colour) and point styles (shape) in the aes argument.
# Scatter plot with colour mapped to Time and point shape mapped to genotype.
ggplot(
  data = mydata,
  mapping = aes(x = TRL, y = TNLR, colour = Time, shape = genotype)
) +
  geom_point()
# Scatter plot with colour mapped to Time and shape mapped to genotype,
# rendered with theme_classic().
ggplot(
  data = mydata,
  mapping = aes(x = TRL, y = TNLR, colour = Time, shape = genotype)
) +
  geom_point() +
  theme_classic()
facets
Facets can be used to split the data and present the subsets side by side.
# Scatter plot coloured by genotype, with one panel per genotype.
ggplot(
  data = mydata,
  mapping = aes(x = TRL, y = TNLR, colour = genotype)
) +
  geom_point() +
  facet_wrap(~ genotype)
stats to the plots
ggplot has some built-in stat functions that can be directly used in the plots.
# Scatter plot coloured by genotype, with a geom_smooth() layer using its
# default settings.
ggplot(
  data = mydata,
  mapping = aes(x = TRL, y = TNLR, colour = genotype)
) +
  geom_point() +
  geom_smooth()
## `geom_smooth()` using method = 'loess'
stats to the plots
ggplot has some built-in stat functions that can be directly used in the plots.
# Scatter plot coloured by genotype, with a smoothing layer drawn with
# se = FALSE.
ggplot(
  data = mydata,
  mapping = aes(x = TRL, y = TNLR, colour = genotype)
) +
  geom_point() +
  geom_smooth(se = FALSE)
stats to the plots
ggplot has some built-in stat functions that can be directly used in the plots.
# Scatter plot coloured by genotype, with a smoothing layer that uses
# method = "lm" and se = FALSE.
ggplot(
  data = mydata,
  mapping = aes(x = TRL, y = TNLR, colour = genotype)
) +
  geom_point() +
  geom_smooth(se = FALSE, method = "lm")
stats to the plots
ggplot has some built-in stat functions that can be directly used in the plots.
# Scatter plot coloured by genotype, with a stat_ellipse() layer on top.
ggplot(
  data = mydata,
  mapping = aes(x = TRL, y = TNLR, colour = genotype)
) +
  geom_point() +
  stat_ellipse()
# Box plot of TRL for each genotype.
ggplot(
  data = mydata,
  mapping = aes(x = genotype, y = TRL)
) +
  geom_boxplot()
Now, we can also try to see all the data at once. For this, we need to play a bit with the initial data, to move from a wide to a long format.
# Reshape from wide to long: the columns from TRL through ExtPathLength are
# stacked into a "variable" column (the former column name) and a "value"
# column (the measurement).
mydatalong <- mydata %>%
  gather(TRL:ExtPathLength, key = "variable", value = "value")

# Show the first rows of the long table.
mydatalong %>%
  head()
## # A tibble: 6 x 5 ## FileName genotype Time variable value ## <chr> <chr> <int> <chr> <dbl> ## 1 dense-3-1-12_1 dense 4 TRL 118.1103 ## 2 dense-3-1-12_1 dense 5 TRL 236.2205 ## 3 dense-3-1-12_1 dense 6 TRL 590.5513 ## 4 dense-3-1-12_1 dense 7 TRL 1594.4887 ## 5 dense-3-1-12_1 dense 8 TRL 3661.4180 ## 6 dense-3-1-12_1 dense 9 TRL 6259.8433
# Make a box plot for each variable, one panel per variable in two rows.
mydatalong %>%
  filter(Time == max(Time)) %>% # Select just one time point (the last)
  ggplot(aes(genotype, value, colour = genotype)) +
  geom_boxplot() +
  facet_wrap(~variable, nrow = 2) +
  theme(text = element_text(size = 9))
# Make a box plot for each variable, one panel per variable in two rows.
# scales = "free" is passed to facet_wrap() so panels do not share axes.
mydatalong %>%
  filter(Time == max(Time)) %>% # Select just one time point (the last)
  ggplot(aes(genotype, value, colour = genotype)) +
  geom_boxplot() +
  facet_wrap(~variable, nrow = 2, scales = "free") +
  theme(text = element_text(size = 9))
plotly library
Plotly creates leading open source tools for composing, editing, and sharing interactive data visualizations via the Web.
library("plotly")
plotly works great with ggplot
Any type of ggplot can be wrapped up in a plotly figure.
# Build a scatter plot of TNLR against TRL, coloured by genotype.
pl <- ggplot(data = mydata, aes(x = TRL, y = TNLR, colour = genotype)) +
  geom_point()

# Convert the ggplot object into a plotly figure.
ggplotly(pl)
plotly works great with ggplot
We can choose what type of label we want to see.
# Same scatter plot, with FileName mapped to the label aesthetic.
pl <- ggplot(
  data = mydata,
  aes(x = TRL, y = TNLR, colour = genotype, label = FileName)
) +
  geom_point()

# Convert the ggplot object into a plotly figure.
ggplotly(pl)
# plotly works great with ggplot
# Box plot of TRL per genotype, using only the rows at the last time point,
# with FileName mapped to the label aesthetic.
pl <- mydata %>%
  filter(Time == max(Time)) %>%
  ggplot(aes(x = genotype, y = TRL, label = FileName)) +
  geom_boxplot()

# Convert the ggplot object into a plotly figure.
ggplotly(pl)
Google is your friend!
This presentation: http://bit.ly/datavizualisation
http://seananderson.ca/ggplot2-FISH554/
http://blog.echen.me/2012/01/17/quick-introduction-to-ggplot2/
http://tutorials.iq.harvard.edu/R/Rgraphics/Rgraphics.html
http://zevross.com/blog/2014/08/04/beautiful-plotting-in-r-a-ggplot2-cheatsheet-3